library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(fpp2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
## ✔ ggplot2 3.5.2 ✔ fma 2.5
## ✔ forecast 8.24.0 ✔ expsmooth 2.3
##
library(ggplot2)
library(urca)
data <- read.csv("Data/US_Monthly_Air_Passengers00_19.csv")
names(data) <- tolower(names(data))
colnames(data)
## [1] "sum_passengers" "airline_id" "carrier_name"
## [4] "origin" "origin_city_name" "origin_state_abr"
## [7] "origin_state_nm" "origin_country" "origin_country_name"
## [10] "dest" "dest_city_name" "dest_state_abr"
## [13] "dest_state_nm" "dest_country" "dest_country_name"
## [16] "year" "month"
dataus <- subset(data, origin_country == "US")
datausd <- subset(dataus, dest_country == "US")
nc <- subset(datausd, origin_state_abr == "NC" | dest_state_abr == "NC")
colnames(nc)
## [1] "sum_passengers" "airline_id" "carrier_name"
## [4] "origin" "origin_city_name" "origin_state_abr"
## [7] "origin_state_nm" "origin_country" "origin_country_name"
## [10] "dest" "dest_city_name" "dest_state_abr"
## [13] "dest_state_nm" "dest_country" "dest_country_name"
## [16] "year" "month"
dim(nc)
## [1] 279509 17
Checking the cumulative percentage of total passengers in North Carolina to determine which cities to focus on.
# Calculate total passengers for each city
city_passengers <- nc %>%
group_by(city = ifelse(origin_state_abr == "NC", origin_city_name, dest_city_name)) %>%
summarise(total_passengers = sum(sum_passengers)) %>%
arrange(desc(total_passengers))
# Calculate cumulative percentage of passengers
city_passengers <- city_passengers %>%
mutate(cumulative_percent = cumsum(total_passengers) / sum(total_passengers))
head(city_passengers)
We can see that Charlotte and Raleigh/Durham together account for over 91.7% of the passengers in North Carolina.
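As a quick check, we can compute the combined share directly (a minimal sketch using the city_passengers table built above; the city labels are assumed to match the raw data exactly):
# Combined passenger share of Charlotte and Raleigh/Durham (sketch)
city_passengers %>%
filter(city %in% c("Charlotte, NC", "Raleigh/Durham, NC")) %>%
summarise(share = sum(total_passengers) / sum(city_passengers$total_passengers))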
Filtering to only include Charlotte and Raleigh.
df <- subset(nc, origin_city_name == "Raleigh/Durham, NC" | origin_city_name == "Charlotte, NC" | dest_city_name == "Raleigh/Durham, NC" | dest_city_name == "Charlotte, NC")
dim(df)
## [1] 217683 17
217,683 rows remain after filtering.
Checking missing values
missing_values <- colSums(is.na(df))
print(missing_values)
## sum_passengers airline_id carrier_name origin
## 0 5 0 0
## origin_city_name origin_state_abr origin_state_nm origin_country
## 0 0 0 0
## origin_country_name dest dest_city_name dest_state_abr
## 0 0 0 0
## dest_state_nm dest_country dest_country_name year
## 0 0 0 0
## month
## 0
There are no relevant missing values in the data: just 5 in airline_id, which will not affect our ts object.
Top 15 Carriers by Number of Flights
top_carriers <- df %>%
group_by(carrier_name) %>%
summarise(number_of_flights = n()) %>%
top_n(15, number_of_flights)
top_carriers <- top_carriers[order(-top_carriers$number_of_flights), ]
ggplot(top_carriers, aes(x = reorder(carrier_name, -number_of_flights), y = number_of_flights, fill = number_of_flights)) +
geom_bar(stat = "identity") +
scale_fill_gradient(low = "lightblue", high = "blue") + # Gradient from light to dark blue
scale_y_continuous(breaks = seq(0, max(top_carriers$number_of_flights), by = 2000)) + # Custom y-axis breaks
xlab("Carrier Name") +
ylab("Number of Flights") +
ggtitle("Top 15 Carriers by Number of Flights") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major = element_line(color = "gray", linetype = "dashed"), # Add dashed major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines to reduce clutter
panel.background = element_blank(), # Remove panel background
panel.border = element_rect(color = "black", fill = NA)) + # Add border around the plot
labs(fill = "Total Flights")
Top 15 Destinations by Number of Flights
top_destinations <- df %>%
group_by(dest_city_name) %>%
summarise(number_of_flights = n()) %>%
top_n(15, number_of_flights)
top_destinations <- top_destinations[order(-top_destinations$number_of_flights), ]
ggplot(top_destinations, aes(x = reorder(dest_city_name, -number_of_flights), y = number_of_flights,
fill = dest_city_name)) +
geom_bar(stat = "identity") +
xlab("Destination City Name") +
ylab("Number of Flights") +
ggtitle("Top 15 Destinations by Number of Flights") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major = element_line(color = "gray", linetype = "dashed"), # Add dashed major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines to reduce clutter
panel.background = element_blank(), # Remove panel background
panel.border = element_rect(color = "black", fill = NA), # Add border around the plot
legend.position = "none") # Hide the legend; city names already label the x-axis
Since our top destinations are Charlotte, NC and Raleigh/Durham, NC themselves, we next visualize the top destinations for flights whose origin city is Charlotte, NC or Raleigh/Durham, NC.
Top 15 Destinations by Number of Flights for Charlotte, NC and Raleigh/Durham, NC
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Prepare data for Charlotte, NC
top_destinations_charlotte <- df %>%
filter(origin_city_name == "Charlotte, NC") %>%
group_by(dest_city_name) %>%
summarise(number_of_flights = n()) %>%
top_n(15, number_of_flights) %>%
arrange(desc(number_of_flights))
# Prepare data for Raleigh/Durham, NC
top_destinations_raleigh <- df %>%
filter(origin_city_name == "Raleigh/Durham, NC") %>%
group_by(dest_city_name) %>%
summarise(number_of_flights = n()) %>%
top_n(15, number_of_flights) %>%
arrange(desc(number_of_flights))
# Identify common destinations
common_destinations <- intersect(top_destinations_charlotte$dest_city_name, top_destinations_raleigh$dest_city_name)
# Plot for Charlotte with conditional coloring
plot_charlotte <- ggplot(top_destinations_charlotte, aes(x = reorder(dest_city_name, -number_of_flights), y = number_of_flights, fill = dest_city_name %in% common_destinations)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("TRUE" = "blue4", "FALSE" = "gray")) +
xlab("Destination City Name") +
ylab("Number of Flights") +
ggtitle("Top 15 Destinations from Charlotte, NC") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9), # Smaller tick labels on the x-axis
axis.text.y = element_text(size = 9), # Smaller tick labels on the y-axis
plot.title = element_text(size = 10), # Smaller plot title
legend.position = "none")
# Plot for Raleigh/Durham with conditional coloring
plot_raleigh <- ggplot(top_destinations_raleigh, aes(x = reorder(dest_city_name, -number_of_flights), y = number_of_flights, fill = dest_city_name %in% common_destinations)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("TRUE" = "red3", "FALSE" = "gray")) +
xlab("Destination City Name") +
ylab("Number of Flights") +
ggtitle("Top 15 Destinations from Raleigh/Durham, NC") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9), # Smaller tick labels on the x-axis
axis.text.y = element_text(size = 9), # Smaller tick labels on the y-axis
plot.title = element_text(size = 10), # Smaller plot title
legend.position = "none")
# Combine both plots in a grid layout
grid.arrange(plot_charlotte, plot_raleigh, ncol = 2)
Monthly Passenger Traffic Heatmap by Year
heatmap_data <- df %>%
group_by(year, month) %>%
summarise(sum_passengers = sum(sum_passengers, na.rm = TRUE), .groups = 'drop')
# Create the heatmap
ggplot(heatmap_data, aes(x = factor(month), y = factor(year), fill = sum_passengers)) +
geom_tile(color = "white") + # Adds the tiles for the heatmap
scale_fill_gradient(low = "yellow", high = "purple", name = "Sum Passengers") + # Color gradient
labs(title = "Monthly Passenger Traffic Heatmap by Year", x = "Month", y = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Improve x-axis labels visibility
### FORECASTING
## Creating the time series object
Checking whether any year is missing months of data.
# Group by year and summarize to count unique months per year
monthly_data_check <- df %>%
group_by(year) %>%
summarise(month_count = n_distinct(month)) %>%
mutate(full_year_data = if_else(month_count == 12, "Yes", "No"))
# View the results
print(monthly_data_check)
## # A tibble: 20 × 3
## year month_count full_year_data
## <int> <int> <chr>
## 1 2000 12 Yes
## 2 2001 12 Yes
## 3 2002 12 Yes
## 4 2003 12 Yes
## 5 2004 12 Yes
## 6 2005 12 Yes
## 7 2006 12 Yes
## 8 2007 12 Yes
## 9 2008 12 Yes
## 10 2009 12 Yes
## 11 2010 12 Yes
## 12 2011 12 Yes
## 13 2012 12 Yes
## 14 2013 12 Yes
## 15 2014 12 Yes
## 16 2015 12 Yes
## 17 2016 12 Yes
## 18 2017 12 Yes
## 19 2018 12 Yes
## 20 2019 12 Yes
Notice that all years contain all 12 months.
Creating the time series object:
# Aggregate data to monthly frequency
monthly_df <- df %>%
group_by(year, month) %>%
summarize(sum_passengers = sum(sum_passengers), .groups = 'drop')
# Create time series object
ts_df <- ts(monthly_df$sum_passengers,
start = c(min(monthly_df$year),
min(monthly_df$month)),
frequency = 12)
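As a sanity check (a sketch), the series should run from January 2000 through December 2019 at monthly frequency, i.e. 240 observations:
# Expect start 2000, end 2019 + 11/12 (December 2019), frequency 12
tsp(ts_df)
# Expect 20 years x 12 months = 240
length(ts_df)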
Visualizing the ts_df, Monthly Total of Passengers
autoplot(ts_df) +
xlab("Year") +
ylab("Total of Passengers") +
ggtitle("Monthly total of Passengers") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Train/test split: 90% of the data is used for training.
N <- length(ts_df)
train_ratio <- 0.9
T <- floor(train_ratio*N)
S <- N - T
train <- head(ts_df, T)
test <- tail(ts_df, S)
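To see exactly where the split falls, a quick sketch (note that naming a variable T shadows R's shorthand for TRUE, so spelling out TRUE elsewhere is safer):
# With 240 monthly observations: train = 216, test = 24 (the last two years)
c(N = N, train = T, test = S)
# First time point of the test window, in decimal years
tsp(test)[1]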
Visualizing the Train test split
autoplot(train) +
autolayer(test, series = "Test Set") +
xlab("Year") +
ylab("Number of Passengers") +
ggtitle("Train Test Split") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Checking the seasonality of our ts object
ggseasonplot(train)
## Performing a decomposition
Our goal is to understand the underlying components of our train set ts object. Here s.window = "periodic" forces the seasonal pattern to be identical across years, and t.window = 12 sets the span of the trend smoother.
train_stl <- stl(train, s.window = "periodic", t.window = 12)
autoplot(train_stl)
## Box-Cox Transform
We want to pick the lambda that best stabilizes the variance, and then pass it (stored as la) to the lambda argument of our forecast functions.
la <- BoxCox.lambda(train)
la
## [1] 1.127672
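For reference, the Box-Cox transform applied by these forecast functions is
\[ w_t = \begin{cases} \log(y_t), & \lambda = 0 \\ \dfrac{y_t^{\lambda} - 1}{\lambda}, & \lambda \neq 0 \end{cases} \]
Since \(\lambda \approx 1.13\) is close to 1, the transform is nearly linear here and mainly rescales the series rather than reshaping its variance.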
# Naive forecast
naive_forecast <- naive(train, lambda=la, h = S)
accuracy(naive_forecast, test)
## ME RMSE MAE MPE MAPE MASE
## Training set 11936.57 304374.4 221841.3 -0.1478918 7.220260 1.196339
## Test set 300823.17 507856.0 460717.3 5.7301006 9.871906 2.484541
## ACF1 Theil's U
## Training set -0.2782278 NA
## Test set 0.4046188 1.112787
naive_rmse <- accuracy(naive_forecast, test)["Test set", "RMSE"]
# Seasonal Naive forecast
snaive_forecast <- snaive(train, lambda=la, h = S)
accuracy(snaive_forecast, test)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 108931.4 232713.1 185433.5 2.986508 6.172547 1.000000 0.6846912
## Test set 322653.0 393043.9 332285.4 6.792431 7.040486 1.791938 0.5669365
## Theil's U
## Training set NA
## Test set 0.8649315
snaive_rmse <- accuracy(snaive_forecast, test)["Test set", "RMSE"]
# Drift forecast
drift_forecast <- rwf(train, lambda=la, drift = TRUE, h = S)
accuracy(drift_forecast, test)
## ME RMSE MAE MPE MAPE MASE
## Training set 138.7344 304057.0 222624.6 -0.5251443 7.250946 1.200563
## Test set 159096.7278 398120.4 367214.6 2.7260989 8.052901 1.980303
## ACF1 Theil's U
## Training set -0.2782679 NA
## Test set 0.3003130 0.8787021
drift_rmse <- accuracy(drift_forecast, test)["Test set", "RMSE"]
# Average forecast
mean_forecast <- meanf(train, lambda=la, h = S)
accuracy(mean_forecast, test)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -10522.44 728628.1 619427 -6.068036 20.95812 3.340426 0.8980106
## Test set 1277934.02 1341841.4 1277934 27.132287 27.13229 6.891601 0.4046188
## Theil's U
## Training set NA
## Test set 3.006082
mean_rmse <- accuracy(mean_forecast, test)["Test set", "RMSE"]
# Holt-Winters
hw_forecast <- hw(train, lambda=la, h = S)
accuracy(hw_forecast, test)
## ME RMSE MAE MPE MAPE MASE
## Training set 6597.618 124543.9 86156.64 0.001745751 2.950183 0.4646227
## Test set 155765.581 281194.6 227025.20 2.999212155 4.830478 1.2242941
## ACF1 Theil's U
## Training set 0.06367737 NA
## Test set 0.46088394 0.6087398
hw_rmse <- accuracy(hw_forecast, test)["Test set", "RMSE"]
# Selecting the best ETS model
tr_best_ets <- ets(train, model = "ZZZ", ic = c("aicc", "aic", "bic"), lambda=la)
summary(tr_best_ets)
## ETS(A,A,A)
##
## Call:
## ets(y = train, model = "ZZZ", lambda = la, ic = c("aicc", "aic",
## "bic"))
##
## Box-Cox transformation: lambda= 1.1277
##
## Smoothing parameters:
## alpha = 0.5138
## beta = 1e-04
## gamma = 1e-04
##
## Initial states:
## l = 14704739.8335
## b = 34598.6949
## s = -471688.8 -447220.1 754257.6 -1679971 1164133 2006007
## 1518944 1611110 556299 939460 -3239083 -2712248
##
## sigma: 861643.9
##
## AIC AICc BIC
## 7082.407 7085.498 7139.786
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 6593.812 124543.6 86159.68 0.001676557 2.950283 0.4646391
## ACF1
## Training set 0.06342413
ETS(A,A,A) denotes additive error, additive trend, and additive seasonality. Forecasting with the selected ETS model and checking test-set accuracy:
ets_forecast <- forecast(tr_best_ets, h = S)
accuracy(ets_forecast, test)
## ME RMSE MAE MPE MAPE MASE
## Training set 6593.812 124543.6 86159.68 0.001676557 2.950283 0.4646391
## Test set 155753.047 281186.5 227017.21 2.998939771 4.830318 1.2242510
## ACF1 Theil's U
## Training set 0.06342413 NA
## Test set 0.46087904 0.6087214
ets_rmse <- accuracy(ets_forecast, test)["Test set", "RMSE"]
Visualizing Best ETS Forecast
autoplot(train) +
autolayer(ets_forecast, series = "ETS(A, A, A)", PI = F) +
autolayer(test, series = "Test Set") +
xlab("Year") +
ylab("Number of Passengers") +
ggtitle("Best ETS Model Forecast") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
In the plot we can see that the forecast has a similar pattern to our
test set.
# ARIMA models
**Checking if the data is stationary.** Based on visual inspection, the time series is clearly not stationary; let's check statistically.
summary(ur.kpss(train))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 4 lags.
##
## Value of test-statistic is: 4.1306
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
Since the test statistic 4.1306 exceeds the 5% critical value of 0.463, we reject the null hypothesis of stationarity: this series is NOT stationary.
Making our time series stationary
Checking how many seasonal differences are needed to make the series stationary.
nsdiffs(log(train))
## [1] 1
We run a KPSS test on the seasonally differenced series.
kpss_results <- ur.kpss(diff(train, lag=frequency(train)))
summary(kpss_results)
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 4 lags.
##
## Value of test-statistic is: 0.1466
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
Since the test statistic 0.1466 is below the 5% critical value of 0.463, we fail to reject the null hypothesis: the series is stationary after one seasonal difference.
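As an additional check (a sketch), we can confirm that no further non-seasonal differencing is needed after the seasonal difference; this supports setting d = 0 and D = 1 in the auto.arima() calls below:
# Additional first differences needed after one seasonal difference (expect 0)
ndiffs(diff(train, lag = frequency(train)))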
ggtsdisplay(diff(train, lag = frequency(train)))
ACF Plot (Bottom Left Panel): There are significant autocorrelations at initial lags (notably at lag 1 through 7), which gradually decay, indicating a possible AR (autoregressive) process. A notable spike at lag 12 suggests a potential yearly seasonal effect, as this autocorrelation persists over a significant period.
PACF Plot (Bottom Right Panel): A significant spike at lag 1 indicates a strong AR(1) component is likely present in the data. Spikes at lags 12 and 24 suggest seasonal effects that may require a seasonal ARIMA model to account for yearly patterns observed at consistent intervals.
Based on these findings, the data likely requires a seasonal ARIMA model with a non-seasonal AR(1) term and seasonal terms to adequately capture the observed yearly patterns; a hand-specified candidate is sketched below.
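The orders below are our own reading of the ACF/PACF, not output from the model search that follows; a minimal hand-specified sketch for comparison with the auto.arima() fits:
# Hand-specified SARIMA candidate: non-seasonal AR(1),
# one seasonal difference, seasonal MA(1)
manual_arima <- Arima(train, order = c(1, 0, 0), seasonal = c(0, 1, 1), lambda = la)
summary(manual_arima)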
# AUTO ARIMA MODELS
ARIMA 1:
We use auto.arima() to pick the best model among the candidate ARIMA specifications, varying the information-criterion penalty.
arima1 <- auto.arima(train, lambda=la, seasonal = TRUE, ic="aic", stepwise = FALSE, d=0, D=1)
arima1
## Series: train
## ARIMA(1,0,1)(2,1,1)[12] with drift
## Box Cox transformation: lambda= 1.127672
##
## Coefficients:
## ar1 ma1 sar1 sar2 sma1 drift
## 0.8711 -0.3289 0.0403 -0.0508 -0.7172 69592.891
## s.e. 0.0492 0.0974 0.1279 0.1104 0.1071 8091.438
##
## sigma^2 = 6.983e+11: log likelihood = -3072.79
## AIC=6159.57 AICc=6160.15 BIC=6182.8
The model captures both non-seasonal and seasonal dynamics in the data, adjusts for trend via the drift term, and stabilizes variance via the Box-Cox transformation. Comparing each coefficient with its standard error (see the sketch below) suggests the ar1, ma1, sma1, and drift terms are clearly significant, while the two seasonal AR terms are not.
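As a rough significance check (a sketch; |z| above about 2 indicates significance at the 5% level), divide each coefficient by its standard error:
# Approximate z-statistics for the ARIMA coefficients
round(coef(arima1) / sqrt(diag(arima1$var.coef)), 2)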
arima_forecast <- forecast(arima1, h = S)
accuracy(arima_forecast, test)
## ME RMSE MAE MPE MAPE MASE
## Training set -3818.985 119641.0 83389.97 -0.3545059 2.788057 0.4497027
## Test set 94407.155 208578.9 164854.33 1.7777714 3.547351 0.8890211
## ACF1 Theil's U
## Training set 0.03036219 NA
## Test set 0.34232132 0.4495678
arima1_rmse <- accuracy(arima_forecast, test)["Test set", "RMSE"]
Indeed, the auto.arima function chose an AR-based model with a seasonal component.
autoplot(train) +
autolayer(arima_forecast, series = "ARIMA Forecast", PI=F) +
autolayer(test, series = "Test Set") +
xlab("Year") +
ylab("Number of Passengers") +
ggtitle("ARIMA(1,0,1)(2,1,1)[12] with drift ") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ARIMA 2: Changing the ic criterion to BIC and setting approximation = FALSE, forcing the function to use a slower but more accurate fitting method.
arima2 <- auto.arima(train, lambda=la, seasonal = TRUE, ic="bic", stepwise = FALSE,
d=0, D=1, approximation = FALSE)
arima2
## Series: train
## ARIMA(1,0,1)(0,1,1)[12] with drift
## Box Cox transformation: lambda= 1.127672
##
## Coefficients:
## ar1 ma1 sma1 drift
## 0.8789 -0.3439 -0.7119 69144.818
## s.e. 0.0454 0.0929 0.0569 8554.303
##
## sigma^2 = 6.941e+11: log likelihood = -3073.08
## AIC=6156.15 AICc=6156.46 BIC=6172.74
arima2_forecast <- forecast(arima2, h = S)
accuracy(arima2_forecast, test)
## ME RMSE MAE MPE MAPE MASE
## Training set -3672.582 119829.8 84685.9 -0.3494371 2.827757 0.4566914
## Test set 103649.526 213914.6 171591.7 1.9817717 3.690443 0.9253541
## ACF1 Theil's U
## Training set 0.02990231 NA
## Test set 0.33335629 0.4624915
arima2_rmse <- accuracy(arima2_forecast, test)["Test set", "RMSE"]
autoplot(train) +
autolayer(arima2_forecast, series = "ARIMA Forecast", PI=F) +
autolayer(test, series = "Test Set") +
xlab("Year") +
ylab("Number of Passengers") +
ggtitle("ARIMA(1,0,1)(0,1,1)[12] with drift ic = bic and approximation = F") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Choosing a final model for Forecasting
rmses <- data.frame(mean_rmse, naive_rmse, snaive_rmse, drift_rmse, hw_rmse, ets_rmse, arima1_rmse, arima2_rmse)
rmses
The first ARIMA model has the lowest test RMSE. The candidate models are: 1. arima1, 2. arima2, 3. ets.
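To rank the models directly, a one-line sketch:
# Test RMSEs sorted from best to worst
sort(unlist(rmses))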
Let’s see all the models in a plot
autoplot(window(train, start = c(2017,1))) +
autolayer(naive_forecast, series = "Naive", PI = F) +
autolayer(snaive_forecast, series = "Seasonal Naive", PI = F) +
autolayer(drift_forecast, series = "Drift", PI = F) +
autolayer(mean_forecast, series = "Average", PI = F) +
autolayer(hw_forecast, series = "Holt winters", PI = F) +
autolayer(ets_forecast, series = "Best ETS", PI = F) +
autolayer(arima_forecast, series = "ARIMA1", PI = F) +
autolayer(arima2_forecast, series = "ARIMA2", PI = F) +
autolayer(test, series = "testing")+
xlab("Year") +
ylab("Number of Passengers") +
ggtitle("Forecast Model Comparison zoom from January 2017") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Checking residuals of these models
# Ljung-Box lags: the smaller of T/5 and twice the seasonal period
L <- min(ceiling(T/5), 2*frequency(train))
checkresiduals(arima1, lag = L)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1)(2,1,1)[12] with drift
## Q* = 29.633, df = 19, p-value = 0.05666
##
## Model df: 5. Total lags used: 24
checkresiduals(arima2, lag = L)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1)(0,1,1)[12] with drift
## Q* = 30.634, df = 21, p-value = 0.07997
##
## Model df: 3. Total lags used: 24
checkresiduals(tr_best_ets, lag = L)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,A,A)
## Q* = 44.125, df = 24, p-value = 0.007378
##
## Model df: 0. Total lags used: 24
At the 5% level, the Ljung-Box test fails to reject white-noise residuals for both ARIMA models (p = 0.057 and p = 0.080), while the ETS residuals show significant autocorrelation (p = 0.007).
# Cross validation of our models
arima1_cv <- function(y, h) {
# Refit the selected ARIMA(1,0,1)(2,1,1)[12] spec on the segment 'y',
# keeping the drift and Box-Cox lambda so the CV matches arima1
model <- Arima(y, order = c(1, 0, 1), seasonal = c(2, 1, 1),
include.drift = TRUE, lambda = la)
# Return the forecast over a horizon 'h'
return(forecast(model, h = h))
}
# Perform cross-validation
arima1_cv_results <- tsCV(ts_df, arima1_cv, h = S)
# Calculate RMSE from the cross-validation results
arima1_cv_rmse <- sqrt(mean(arima1_cv_results^2, na.rm = TRUE))
arima2_cv <- function(y, h) {
# Refit the selected ARIMA(1,0,1)(0,1,1)[12] spec on the segment 'y',
# keeping the drift and Box-Cox lambda so the CV matches arima2
model <- Arima(y, order = c(1, 0, 1), seasonal = c(0, 1, 1),
include.drift = TRUE, lambda = la)
# Return the forecast over a horizon 'h'
return(forecast(model, h = h))
}
# Perform cross-validation
arima2_cv_results <- tsCV(ts_df, arima2_cv, h = S)
# Calculate RMSE from the cross-validation results
arima2_cv_rmse <- sqrt(mean(arima2_cv_results^2, na.rm = TRUE))
ets_cv <- function(y, h){
# Refit the selected ETS(A,A,A) spec with the same Box-Cox lambda
model <- ets(y, model = "AAA", lambda = la)
return(forecast(model, h = h))
}
ets_cv_results <- tsCV(ts_df, ets_cv, h = S)
ets_cv_rmse <- sqrt(mean(ets_cv_results^2, na.rm = TRUE))
cv_rmses <- data.frame(arima1_cv_rmse, arima2_cv_rmse, ets_cv_rmse)
cv_rmses
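Averaging squared errors over every horizon can hide horizon-dependent degradation; a per-horizon breakdown (a sketch over the tsCV error matrices) is:
# RMSE at each forecast horizon h = 1..S for the two ARIMA specs
sqrt(colMeans(arima1_cv_results^2, na.rm = TRUE))
sqrt(colMeans(arima2_cv_results^2, na.rm = TRUE))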
## MODEL SELECTED
arima1 is the model selected, based on:
- Alignment with course methods: ARIMA1 uses default settings like approximation = TRUE, aligning with the methodology taught in class.
- Good in-sample performance: ARIMA1 shows superior in-sample RMSE, indicating a better fit to the training data.
- Competitive cross-validation performance: although ARIMA1's cross-validation RMSE is slightly higher than ARIMA2's, the difference is minimal, suggesting ARIMA1 generalizes adequately to unseen data.
- Practicality and computational efficiency: approximation = TRUE speeds up model fitting, which is especially beneficial for large datasets, reducing computational demands without substantially compromising accuracy.
autoplot(train) +
autolayer(forecast(arima1, h = S), series = "ARIMA(1,0,1)(2,1,1)[12] with drift", PI = F) +
autolayer(test, series = "Test Set") +
xlab("Year") +
ylab("Number of Passengers") +
ggtitle("Final Model Forecast comparing with the Test set") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Fit the ARIMA model
arima_final <- Arima(ts_df, order=c(1,0,1), seasonal=c(2,1,1), include.drift=TRUE, lambda=la)
# Generate forecasts for the first half of 2020
future_forecast <- forecast(arima_final, h=6)
# Plotting the forecast along with the training data
# Since there is no actual test data for 2020, we only plot the forecast
autoplot(ts_df) +
autolayer(future_forecast, series = "Forecast for First Half of 2020", PI = FALSE) +
xlab("Year") +
ylab("Number of Passengers") +
ggtitle("Forecast for the First Half of 2020") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
forecast_table <- data.frame(
Month = seq(as.Date("2020-01-01"), by = "month", length.out = 6),
Forecasted_Passengers = round(future_forecast$mean),
Lower_95 = round(future_forecast$lower[,2]), # column 2 holds the 95% level (column 1 is 80%)
Upper_95 = round(future_forecast$upper[,2])
)
print(forecast_table)
## Month Forecasted_Passengers Lower_95 Upper_95
## 1 2020-01-01 4567622 4322334 4811238
## 2 2020-02-01 4389766 4117390 4659999
## 3 2020-03-01 5126686 4840709 5410640
## 4 2020-04-01 5025187 4723591 5324489
## 5 2020-05-01 5282171 4970933 5591084
## 6 2020-06-01 5169308 4847909 5488174
The selected model is \(\text{ARIMA}(1,0,1)(2,1,1)_{12}\) with drift, fit after a Box-Cox transformation with \(\lambda = 1.127672\).
Selected Model Equation
Letting \(B\) denote the backshift operator, \(y_t'\) the Box-Cox transformed series, and \(c = 69592.891\) the drift coefficient (a linear trend regressor), the fitted model is
\[ (1 - 0.8711B)\,(1 - 0.0403B^{12} + 0.0508B^{24})\,(1 - B^{12})\,(y_t' - c\,t) = (1 - 0.3289B)\,(1 - 0.7172B^{12})\,\epsilon_t \]
# Creating a new data frame just to extract the geolocations
#city_names <- df %>%
# select(origin_city_name, dest_city_name) %>%
# unlist() %>%
# unique()
# Create a new dataframe with the unique city names
#new_df <- data.frame(city_name = city_names)
# Print the new dataframe to see the result
#print(new_df)
# Using the Google Maps API to get the geolocations
#install.packages("ggmap")
#library(ggmap)
#register_google(key = "Replace Here")
# Geocode each city to find latitude and longitude
#new_df$city_name <- as.character(new_df$city_name)
#coords <- geocode(new_df$city_name, output = "latlona", source = "google")
# Check for any geocoding errors
#if (any(is.na(coords$lon) | is.na(coords$lat))) {
# warning("Some coordinates were not found")
#}
# Add coordinates to the original dataframe
#new_df <- bind_cols(new_df, coords[, c("lon", "lat")])
#new_df
# Writing the data frame with geocodes so that another call to the Google API is not necessary
# Write the new_df to a CSV file in the Data folder
#write.csv(new_df, "Data/geocodes_df.csv", row.names = FALSE)
geocodes <- read.csv("Data/geocodes_df.csv")
Joining the geolocations onto our data set. The data is split into two data sets based on whether the flights are coming from, or going to, Charlotte, NC or Raleigh/Durham, NC.
colnames(geocodes)
## [1] "city_name" "lon" "lat"
# Flights originating from Charlotte or Raleigh
flights_from_char_raleigh <- df %>%
filter(origin_city_name %in% c("Charlotte, NC", "Raleigh/Durham, NC"))
# Flights destined for Charlotte or Raleigh
flights_to_char_raleigh <- df %>%
filter(dest_city_name %in% c("Charlotte, NC", "Raleigh/Durham, NC"))
flights_from_char_raleigh <- flights_from_char_raleigh %>%
left_join(geocodes, by = c("dest_city_name" = "city_name"))
flights_to_char_raleigh <- flights_to_char_raleigh %>%
left_join(geocodes, by = c("origin_city_name" = "city_name"))
Checking the final data sets used to create the maps.
colnames(flights_from_char_raleigh)
## [1] "sum_passengers" "airline_id" "carrier_name"
## [4] "origin" "origin_city_name" "origin_state_abr"
## [7] "origin_state_nm" "origin_country" "origin_country_name"
## [10] "dest" "dest_city_name" "dest_state_abr"
## [13] "dest_state_nm" "dest_country" "dest_country_name"
## [16] "year" "month" "lon"
## [19] "lat"
colnames(flights_to_char_raleigh)
## [1] "sum_passengers" "airline_id" "carrier_name"
## [4] "origin" "origin_city_name" "origin_state_abr"
## [7] "origin_state_nm" "origin_country" "origin_country_name"
## [10] "dest" "dest_city_name" "dest_state_abr"
## [13] "dest_state_nm" "dest_country" "dest_country_name"
## [16] "year" "month" "lon"
## [19] "lat"
# Checking NaN values in specific longitude and latitude columns
missing_from_lo <- sum(is.na(flights_from_char_raleigh$lon))
missing_from_la <- sum(is.na(flights_from_char_raleigh$lat))
missing_to_lo <- sum(is.na(flights_to_char_raleigh$lon))
missing_to_la <- sum(is.na(flights_to_char_raleigh$lat))
# Print the number of missing values for each column
cat("Missing values from Charlotte and Raleigh: ", missing_from_lo, "\n")
## Missing values from Charlotte and Raleigh: 0
cat("Missing values from Charlotte and Raleigh: ", missing_from_la, "\n")
## Missing values from Charlotte and Raleigh: 0
cat("Missing values to Charlotte and Raleigh: ", missing_to_lo, "\n")
## Missing values to Charlotte and Raleigh: 0
cat("Missing values to Charlotte and Raleigh: ", missing_to_la, "\n")
## Missing values to Charlotte and Raleigh: 0
Writing the data frames to CSV files in the Data folder, to build the interactive map in Shiny.
#write.csv(flights_from_char_raleigh, "Data/from_char_raleigh.csv", row.names = FALSE)
#write.csv(flights_to_char_raleigh, "Data/to_char_raleigh.csv", row.names = FALSE)
Creating sf objects for future plots
#install.packages("sf")
library(sf)
## Linking to GEOS 3.13.1, GDAL 3.10.2, PROJ 9.5.1; sf_use_s2() is TRUE
usa_sf <- st_read("Data/sf/cb_2018_us_state_5m.shp")
## Reading layer `cb_2018_us_state_5m' from data source
## `C:\Users\ipm36\OneDrive\Transcribed Files\notes_quiz4_files\Documents\OneDrive\Documents\GitHub\AirPassenger\Data\sf\cb_2018_us_state_5m.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 56 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.1473 ymin: -14.55255 xmax: 179.7785 ymax: 71.35256
## Geodetic CRS: NAD83
# Create an sf object for flights coming from Charlotte, NC or Raleigh/Durham, NC
flights_from_char_ral_sf <- st_as_sf(flights_from_char_raleigh, coords = c("lon", "lat"),
crs = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
# Create an sf object for flights going to Charlotte, NC or Raleigh/Durham, NC
flights_to_char_ral_sf <- st_as_sf(flights_to_char_raleigh, coords = c("lon", "lat"),
crs = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
## Maps
ggplot() +
geom_sf(data = usa_sf, fill = "white", color = "gray") +
geom_sf(data = flights_to_char_ral_sf, aes(color = dest_city_name, size = sum_passengers)) +
scale_color_manual(values = c("Charlotte, NC" = "blue4", "Raleigh/Durham, NC" = "red3")) +
scale_size(range = c(0.5, 3), breaks = c(500, 5000, 50000), labels = c("500", "5K", "50K")) +
labs(title = "Flight Traffic from other cities to Charlotte and Raleigh, NC") +
coord_sf(xlim = c(-160, -60), ylim = c(10, 70))
flight_counts <- table(flights_to_char_ral_sf$origin_state_abr)
flight_counts
##
## AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS
## 32 1762 730 1227 4108 1378 1323 35 9930 3796 48 347 105 3482 2336 107
## KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM
## 3510 1458 1970 1760 559 2991 1948 2834 917 162 7579 79 463 753 2838 287
## NV NY OH OK OR PA PR RI SC SD TN TX UT VA VI VT
## 1017 8088 4519 615 367 5542 503 1035 4084 61 5109 5826 604 7507 383 169
## WA WI WV WY
## 729 1056 906 10
usa_sf2 <- st_read("Data/sf/cb_2018_us_state_5m.shp")
## Reading layer `cb_2018_us_state_5m' from data source
## `C:\Users\ipm36\OneDrive\Transcribed Files\notes_quiz4_files\Documents\OneDrive\Documents\GitHub\AirPassenger\Data\sf\cb_2018_us_state_5m.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 56 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.1473 ymin: -14.55255 xmax: 179.7785 ymax: 71.35256
## Geodetic CRS: NAD83
# Convert the flight_counts table to a data frame for easier merging
flight_counts_df <- as.data.frame(flight_counts)
names(flight_counts_df) <- c("STUSPS", "flightCounts")
# Merge with the spatial data frame
usa_sf2 <- merge(usa_sf2, flight_counts_df, by = "STUSPS", all.x = TRUE)
# Replace NA values with 0 in flightCounts if any state has no flights
usa_sf2$flightCounts[is.na(usa_sf2$flightCounts)] <- 0
# Plot using geom_sf
ggplot(data = usa_sf2) +
geom_sf(aes(fill = flightCounts)) +
scale_fill_gradient(low = "white", high = "green4", na.value = "grey", name = "Flight Counts") +
labs(title = "Flight Counts to Charlotte and Raleigh per state", fill = "Flight Counts") +
coord_sf(xlim = c(-130, -60), ylim = c(20, 60))+
theme_minimal()
ggplot() +
geom_sf(data = usa_sf, fill = "white", color = "gray") +
geom_sf(data = flights_from_char_ral_sf, aes(color = origin_city_name, size = sum_passengers)) +
scale_color_manual(values = c("Charlotte, NC" = "blue4", "Raleigh/Durham, NC" = "red3")) +
scale_size(range = c(0.5, 3), breaks = c(500, 5000, 50000), labels = c("500", "5K", "50K")) +
labs(title = "Flight Traffic from Charlotte and Raleigh, NC to other cities") +
coord_sf(xlim = c(-160, -60), ylim = c(10, 70))
flight_counts2 <- table(flights_from_char_ral_sf$dest_state_abr)
flight_counts2
##
## AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS
## 22 1857 757 1146 4231 1363 1299 40 9617 3912 106 330 124 3451 2559 123
## KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM
## 3642 1636 2039 1787 556 3102 2058 2984 919 119 7517 49 437 762 2868 337
## NV NY OH OK OR PA PR RI SC SD TN TX UT VA VI VT
## 1036 8163 4759 654 381 5672 495 1071 4007 70 5260 6070 654 7551 371 176
## WA WI WV WY
## 835 1162 918 10
usa_sf3 <- st_read("Data/sf/cb_2018_us_state_5m.shp")
## Reading layer `cb_2018_us_state_5m' from data source
## `C:\Users\ipm36\OneDrive\Transcribed Files\notes_quiz4_files\Documents\OneDrive\Documents\GitHub\AirPassenger\Data\sf\cb_2018_us_state_5m.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 56 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.1473 ymin: -14.55255 xmax: 179.7785 ymax: 71.35256
## Geodetic CRS: NAD83
# Convert the flight_counts table to a data frame for easier merging
flight_counts_df2 <- as.data.frame(flight_counts2)
names(flight_counts_df2) <- c("STUSPS", "flightCounts2")
# Merge with the spatial data frame
usa_sf3 <- merge(usa_sf3, flight_counts_df2, by = "STUSPS", all.x = TRUE)
# Replace NA values with 0 in flightCounts if any state has no flights
usa_sf3$flightCounts2[is.na(usa_sf3$flightCounts2)] <- 0
# Plot using geom_sf
ggplot(data = usa_sf3) +
geom_sf(aes(fill = flightCounts2)) +
scale_fill_gradient(low = "white", high = "green4", na.value = "grey", name = "Flight Counts") +
labs(title = "Flight Counts from Charlotte and Raleigh per state", fill = "Flight Counts") +
coord_sf(xlim = c(-130, -60), ylim = c(20, 60))+
theme_minimal()